Rem
Rem $Header: rdbms/demo/dmstardemo.sql /main/1 2012/04/15 16:31:57 xbarr Exp $
Rem
Rem dmstardemo.sql
Rem
Rem Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. 
Rem
Rem    NAME
Rem      dmstardemo.sql - Sample program for the DBMS_DATA_MINING package
Rem
Rem    DESCRIPTION
Rem      This script creates a clustering model
Rem      using the K-Means algorithm
Rem      and data in a SH (Sales History) star schema in the RDBMS.
Rem
Rem    NOTES
Rem
Rem    MODIFIED   (MM/DD/YY)
Rem    amozes      03/02/12 - dm demo with star schema
Rem    amozes      03/02/12 - Created
Rem

-- SQL*Plus session settings that shape the output of this script.
-- Echo each command of the script as it is executed.
SET ECHO ON
-- Report the row count for every query that returns at least one row.
SET FEEDBACK 1
-- Default display width for numeric columns.
SET NUMWIDTH 10
-- Number of characters shown on one output line.
SET LINESIZE 80
-- Strip trailing blanks from spooled lines.
SET TRIMSPOOL ON
-- Pad output with spaces instead of tab characters.
SET TAB OFF
-- Number of lines per output page.
SET PAGESIZE 100


-------------------
-- STAR SCHEMA
-- Assemble the mining input from the sh star schema.
-- Every customer row carries all columns of sh.customers plus a nested
-- collection holding the amount purchased in each product subcategory,
-- so that the clustering model can account for customer behavior as
-- well as demographics.
-- Customers without any sales are not part of the view (inner join).
CREATE OR REPLACE VIEW cust_with_sales AS
WITH subcat_totals AS (
  -- amount sold per customer and product subcategory
  SELECT s.cust_id,
         p.prod_subcategory,
         SUM(s.amount_sold) AS sum_amount_sold
    FROM sh.sales s
         INNER JOIN sh.products p
            ON p.prod_id = s.prod_id
   GROUP BY s.cust_id, p.prod_subcategory
),
cust_subcat_sales AS (
  -- one row per customer with the subcategory totals as a nested column
  SELECT st.cust_id,
         CAST(COLLECT(dm_nested_numerical(st.prod_subcategory,
                                          st.sum_amount_sold))
              AS dm_nested_numericals) AS per_subcat_sales
    FROM subcat_totals st
   GROUP BY st.cust_id
)
SELECT c.*,
       css.per_subcat_sales
  FROM sh.customers c
       INNER JOIN cust_subcat_sales css
          ON css.cust_id = c.cust_id;

-------------------
-- SPECIFY SETTINGS
--
-- Cleanup old settings table for repeat runs.
-- Only ORA-00942 (table or view does not exist) is ignored, which is the
-- expected outcome of the DROP on a first run.  Any other error is left
-- unhandled so that it is reported instead of being silently hidden.
DECLARE
  e_table_missing EXCEPTION;
  PRAGMA EXCEPTION_INIT(e_table_missing, -942);
BEGIN
  EXECUTE IMMEDIATE 'DROP TABLE dm_star_set';
EXCEPTION
  WHEN e_table_missing THEN
    NULL;
END;
/
-- Echo is switched off around the DDL and switched back on afterwards.
set echo off
create table dm_star_set (setting_name varchar2(30), setting_value varchar2(4000));
set echo on
-- The only setting supplied for the build: automatic data preparation on.
-- NOTE(review): no algorithm setting is inserted here, so the build relies
-- on the default clustering algorithm - the file header expects K-Means,
-- confirm for the target release.
BEGIN
   INSERT INTO dm_star_set (setting_name, setting_value) VALUES
   (dbms_data_mining.prep_auto,dbms_data_mining.prep_auto_on);
END;
/
commit;

---------------------
-- CREATE A NEW MODEL
--
-- Cleanup old model with same name for repeat runs.
-- The model is dropped only when it is listed in USER_MINING_MODELS, so
-- a first run does nothing here, and a genuine DROP_MODEL failure is
-- reported instead of being swallowed by a catch-all handler.
DECLARE
  v_model_count PLS_INTEGER;
BEGIN
  SELECT COUNT(*)
    INTO v_model_count
    FROM user_mining_models
   WHERE model_name = 'DM_STAR_CLUSTER';
  IF v_model_count > 0 THEN
    DBMS_DATA_MINING.DROP_MODEL('DM_STAR_CLUSTER');
  END IF;
END;
/
DECLARE
  TYPE column_name_list IS TABLE OF VARCHAR2(30);

  -- Columns known to be uninteresting.  Removing them from the build
  -- speeds up the process.  Alternatively, they could be left out when
  -- creating the view.
  l_excluded column_name_list := column_name_list(
    'CUST_EFF_TO',
    'CUST_EFF_FROM',
    'CUST_CITY_ID',
    'CUST_STATE_PROVINCE_ID',
    'CUST_STREET_ADDRESS',
    'CUST_FIRST_NAME',
    'CUST_LAST_NAME',
    'CUST_MAIN_PHONE_NUMBER',
    'CUST_EMAIL',
    'CUST_TOTAL_ID',
    'CUST_SRC_ID');

  -- Transformation list handed to CREATE_MODEL.
  l_xforms dbms_data_mining_transform.TRANSFORM_LIST;
BEGIN
  -- New attribute: the difference of the two dates as a numeric duration.
  dbms_data_mining_transform.set_transform(
    l_xforms, 'CUST_RETAIN_DUR', NULL, 'CUST_EFF_TO - CUST_EFF_FROM', NULL);

  -- The country id is converted to a character value so that it is mined
  -- as a categorical attribute, since numeric datatypes are treated as
  -- numeric attributes.
  dbms_data_mining_transform.set_transform(
    l_xforms, 'COUNTRY_ID', NULL, 'TO_CHAR(COUNTRY_ID)', NULL);

  -- A NULL expression is registered for each excluded column, in list order.
  FOR i IN 1 .. l_excluded.COUNT LOOP
    dbms_data_mining_transform.set_transform(
      l_xforms, l_excluded(i), NULL, NULL, NULL);
  END LOOP;

  -- Build the clustering model on the view, using the settings table and
  -- the transformations collected above.
  DBMS_DATA_MINING.CREATE_MODEL(
    model_name          => 'DM_STAR_CLUSTER',
    mining_function     => dbms_data_mining.clustering,
    data_table_name     => 'cust_with_sales',
    case_id_column_name => 'cust_id',
    settings_table_name => 'dm_star_set',
    xform_list          => l_xforms);
END;
/

-------------------------
-- DISPLAY MODEL SETTINGS
--
-- Name and value of every setting recorded for the model, sorted by name.
COLUMN setting_name  FORMAT A30
COLUMN setting_value FORMAT A30
SELECT ms.setting_name,
       ms.setting_value
  FROM user_mining_model_settings ms
 WHERE ms.model_name = 'DM_STAR_CLUSTER'
 ORDER BY ms.setting_name;

--------------------------
-- DISPLAY MODEL SIGNATURE
--
-- Name, attribute type and data type of each attribute of the model,
-- sorted by attribute name.
COLUMN attribute_name FORMAT A30
COLUMN attribute_type FORMAT A20
COLUMN data_type      FORMAT A20
SELECT ma.attribute_name,
       ma.attribute_type,
       ma.data_type
  FROM user_mining_model_attributes ma
 WHERE ma.model_name = 'DM_STAR_CLUSTER'
 ORDER BY ma.attribute_name;


------------------------
-- DISPLAY MODEL DETAILS
--
-- Cluster details are easiest to read in pieces, split according to the
-- associations and groupings that need to be observed.
--
-- CLUSTERS
-- One row per cluster: its id, the number of records in the cluster,
-- the parent cluster id and the level in the hierarchy.
-- The cluster dispersion is not selected by this query.
--
SELECT km.id           AS clu_id,
       km.record_count AS rec_cnt,
       km.parent,
       km.tree_level
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_KM(
               'DM_STAR_CLUSTER', NULL, NULL, 0, 0, 0)) km
 ORDER BY km.id;

-- TAXONOMY
-- Parent/child pairs of the cluster hierarchy, produced by unnesting the
-- child collection of each cluster.
--
SELECT node.id AS clu_id,
       kid.id  AS child_id
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_KM(
               'DM_STAR_CLUSTER', NULL, NULL, 0, 0, 0)) node,
       TABLE(node.child) kid
 ORDER BY node.id, kid.id;

-- CENTROIDS FOR LEAF CLUSTERS
-- Lists every attribute that makes up the centroid of cluster_id 16,
-- with the mean (for numericals) or the mode (for categoricals).
-- Nested attributes are shown as name.subname, so the per-subcategory
-- sales of each customer show up as attributes that were considered
-- when creating the clusters.
--
COLUMN aname    FORMAT A60
COLUMN mode_val FORMAT A40
COLUMN mean_val FORMAT 9999999
SELECT CASE
         WHEN cen.attribute_subname IS NOT NULL
           THEN cen.attribute_name || '.' || cen.attribute_subname
         ELSE cen.attribute_name
       END            AS aname,
       cen.mean       AS mean_val,
       cen.mode_value AS mode_val
  FROM TABLE(DBMS_DATA_MINING.GET_MODEL_DETAILS_KM(
               'DM_STAR_CLUSTER', 16, NULL, 1, 0, 0)) clu,
       TABLE(clu.centroid) cen
 ORDER BY aname;

-------------------------------------------------
-- SCORE NEW DATA USING SQL DATA MINING FUNCTIONS
--
------------------
-- BUSINESS CASE 1
-- Show the clusters into which the customers of this dataset have been
-- grouped, together with the number of customers in each cluster,
-- largest cluster first.
--
SELECT CLUSTER_ID(DM_STAR_CLUSTER USING *) AS clus,
       COUNT(*)                            AS cnt
  FROM cust_with_sales
 GROUP BY CLUSTER_ID(DM_STAR_CLUSTER USING *)
 ORDER BY cnt DESC;
--
------------------
-- BUSINESS CASE 2
-- For customer id 100955, list each likely cluster assignment
-- (> 20% likelihood of assignment) together with the five most relevant
-- attributes for that assignment, most probable cluster first.
--
COLUMN prob FORMAT 9.9999
SET LINESIZE 150
SET LONG 10000
SELECT s.cluster_id,
       s.probability AS prob,
       CLUSTER_DETAILS(DM_STAR_CLUSTER, s.cluster_id, 5 USING t.*) AS det
  FROM (SELECT v.*,
               CLUSTER_SET(DM_STAR_CLUSTER, NULL, 0.2 USING *) AS pset
          FROM cust_with_sales v
         WHERE v.cust_id = 100955) t,
       TABLE(t.pset) s
 ORDER BY prob DESC;
